################################
#	John Henderson
#	Gerrymandering Incumbency 
#		(with Brian Hamel and Aaron Goldzimer)
#		January 1, 2018
#
################################
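# Uses rvest to scrape district-level vote percentages from Wikipedia.
# Last run was: 2/4/17.
# Do not run (note: running re-downloads live Wikipedia pages and overwrites stateSwings.Rdata).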

# Clear the workspace, then attach the string and scraping packages.
rm(list = ls())
library(stringr)
library(rvest)

# Even-numbered election years, 2000 through 2016.
years <- seq(from = 2000, to = 2016, by = 2)

# Tab-separated state lookup table. Column 1 is later used as the search
# pattern for a state's results table and column 2 as the row label
# (legacy note on the columns: "states,states_name" -- TODO confirm order).
states <- read.csv(
	file = '~/Dropbox/StateRedistricting/replication/short/state_abbrev.txt',
	sep = '\t',
	stringsAsFactors = FALSE
)

# Counter kept from the original script; not referenced again in this file.
cnts <- 1

# One row per state, five slots for candidate table indices on a page.
state_tab <- matrix(NA, nrow = nrow(states), ncol = 5)

# Filled with one summary matrix per election year.
year_outs <- list()
# ---- Scrape per-state House vote shares, one Wikipedia page per election year ----
# For every entry of `years` this downloads the matching "United States House of
# Representatives elections" page, picks a results table for each row of `states`,
# and stores in year_outs[[year]] a matrix with one row per state (row names taken
# from states[, 2]) and three columns:
#   1 = mean Republican percentage, 2 = mean Democratic percentage,
#   3 = correlation between the Democratic and Republican percentages.
# Percentages are read from the sixth column of the chosen table, which is assumed
# to hold the candidate list (e.g. "Name (R) 55.2%") -- TODO confirm against the
# current page layout.

# Extract the percentage printed after a party tag such as "(R)" in each cell.
# Returns NA where the tag, or a "%" after it, cannot be found.
party_share <- function(cells, tag) {
	# ")" is swapped for "]" so the tag can be searched for as "R]" / "D]".
	tag_end <- str_locate(
		gsub(pattern = '[)]', replacement = ']', x = cells),
		fixed(paste0(tag, ']'))
	)[, 2]
	# Text from the closing bracket onward, e.g. ") 55.2% ...".
	after_tag <- str_sub(cells, tag_end)
	# Skip the bracket and the character after it, then read up to the first "%".
	as.numeric(str_sub(after_tag, 3, str_locate(after_tag, pattern = '%')[, 1] - 1))
}

for (year in seq_along(years)) {

	url <- paste0('https://en.wikipedia.org/wiki/United_States_House_of_Representatives_elections,_', years[year])
	obs <- read_html(url)
	tables <- html_nodes(obs, 'table')

	# Record the first candidate tables whose markup mentions each state. One grep
	# per state fills every slot, so all five columns are refreshed for each page
	# and no index from a previously processed year can linger.
	for (k in seq_len(nrow(states))) {
		hits <- grep(tables, pattern = states[k, 1])
		state_tab[k, ] <- hits[seq_len(ncol(state_tab))]
	}

	# legacy note: 14 is AK
	state_mat <- matrix(NA, nrow(states), 3)
	# Index of the table used for the previous state on this page (NA = none yet).
	prev_idx <- NA
	for (i in seq_len(nrow(states))) {
		st_name <- states[i, 1]
		cands <- state_tab[i, ]
		tab <- NULL
		tab_idx <- NA

		# Take the first candidate whose first column has an entry starting with
		# the state name; a bare mention elsewhere in the table is not enough.
		for (cand in cands[!is.na(cands)]) {
			cand_tab <- html_table(tables[cand], fill = TRUE)[[1]]
			if (NCOL(cand_tab) < 1) {
				next
			}
			# [[1]] yields a plain vector for both data frames and tibbles.
			first_col <- cand_tab[[1]]
			starts_with_state <- str_sub(first_col, 1, str_length(st_name)) == st_name
			# na.rm keeps empty cells from turning the test into NA.
			if (any(starts_with_state, na.rm = TRUE)) {
				tab <- cand_tab
				tab_idx <- cand
				break
			}
		}

		# Fallback: no candidate qualified, so use the table immediately after the
		# one used for the previous state (accepted without a name check).
		if (is.null(tab)) {
			if (is.na(prev_idx) || prev_idx + 1 > length(tables)) {
				warning('No results table found for ', st_name, ' in ', years[year], call. = FALSE)
				prev_idx <- NA
				next
			}
			tab_idx <- prev_idx + 1
			state_tab[i, 5] <- tab_idx
			tab <- html_table(tables[tab_idx], fill = TRUE)[[1]]
		}

		xR <- party_share(tab[[6]], 'R')
		xD <- party_share(tab[[6]], 'D')

		# Keep only rows in which both parties have a percentage.
		both <- !is.na(xR) & !is.na(xD)
		xR[!both] <- NA
		xD[!both] <- NA

		# cor.test needs at least three complete pairs.
		corrs <- NA
		if (sum(both) > 2) {
			corrs <- cor.test(xD, xR)$estimate
		}
		# R vote ; D vote ; correlation
		state_mat[i, 1:3] <- c(mean(xR, na.rm = TRUE), mean(xD, na.rm = TRUE), corrs)
		prev_idx <- tab_idx
	}
	rownames(state_mat) <- states[, 2]
	year_outs[[year]] <- state_mat
}

# ---- Per-state minimum and maximum of the yearly mean vote shares ----
# Both matrices start as a copy of the first year's results; columns 1 and 2
# are then updated with each of year_outs[[2]] to year_outs[[6]]. Only the
# first six entries of year_outs are used (presumably the 2000-2010 elections
# -- TODO confirm). Column 3 is left at its first-year value.
year_mins <- year_maxs <- year_outs[[1]]
for (k in 2:6) {
	for (j in 1:2) {
		cur <- year_outs[[k]][, j]
		# A missing running value is replaced by the first observed one, so a
		# state with no usable data in the first year still gets a min and max.
		new_min <- !is.na(cur) & (is.na(year_mins[, j]) | cur < year_mins[, j])
		new_max <- !is.na(cur) & (is.na(year_maxs[, j]) | cur > year_maxs[, j])
		year_mins[new_min, j] <- cur[new_min]
		year_maxs[new_max, j] <- cur[new_max]
	}
}

# Label the three columns, keep only the two vote-share columns, and write
# both matrices to the replication folder.
colnames(year_mins) <- c('Rep08','Dem08','Cor')
colnames(year_maxs) <- colnames(year_mins)
year_maxs <- year_maxs[, c(1, 2)]
year_mins <- year_mins[, c(1, 2)]
save(
	year_maxs,
	year_mins,
	file = '~/Dropbox/StateRedistricting/replication/short/stateSwings.Rdata'
)

#end